Colab Setup¶
In [1]:
import sys
IS_COLAB = 'google.colab' in sys.modules
print(f"Running in Google Colab: {IS_COLAB}")
Running in Google Colab: True
In [2]:
import os
import sys
if IS_COLAB:
print("Running in Google Colab environment.")
if os.path.exists('/content/aai521_3proj'):
print("Repository already exists. Pulling latest changes...")
%cd /content/aai521_3proj
!git pull
else:
print("Cloning repository...")
!git clone https://github.com/swapnilprakashpatil/aai521_3proj.git
%cd aai521_3proj
%pip install -r requirements.txt
sys.path.append('/content/aai521_3proj/src')
%ls
else:
print("Running in local environment. Installing packages...")
%pip install -r ../requirements.txt
sys.path.append('../src')
Running in Google Colab environment. Repository already exists. Pulling latest changes... /content/aai521_3proj Already up to date. Requirement already satisfied: numpy>=1.24.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 2)) (2.0.2) Requirement already satisfied: pandas>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 3)) (2.2.2) Requirement already satisfied: scikit-learn>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 4)) (1.6.1) Requirement already satisfied: scipy>=1.11.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 5)) (1.16.3) Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 8)) (2.9.0+cu126) Requirement already satisfied: torchvision>=0.15.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 9)) (0.24.0+cu126) Requirement already satisfied: segmentation-models-pytorch>=0.3.3 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 10)) (0.5.0) Requirement already satisfied: albumentations>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 11)) (2.0.8) Requirement already satisfied: opencv-python>=4.8.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 14)) (4.12.0.88) Requirement already satisfied: scikit-image>=0.21.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 15)) (0.25.2) Requirement already satisfied: Pillow>=10.0.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 16)) (11.3.0) Requirement already satisfied: rasterio>=1.3.8 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 19)) (1.4.3) Requirement already satisfied: geopandas>=0.13.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 20)) (1.1.1) Requirement already satisfied: shapely>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 21)) (2.1.2) Requirement already satisfied: matplotlib>=3.7.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 24)) (3.10.0) Requirement already satisfied: seaborn>=0.12.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 25)) (0.13.2) Requirement already satisfied: tqdm>=4.65.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 28)) (4.67.1) Requirement already satisfied: tensorboard>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 31)) (2.19.0) Requirement already satisfied: jupyter>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 34)) (1.1.1) Requirement already satisfied: ipykernel>=6.25.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 35)) (7.1.0) Requirement already satisfied: ipywidgets>=8.1.0 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 36)) (8.1.8) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas>=2.0.0->-r requirements.txt (line 3)) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=2.0.0->-r requirements.txt (line 3)) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=2.0.0->-r requirements.txt (line 3)) (2025.2) Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn>=1.3.0->-r requirements.txt (line 4)) (1.5.2) Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn>=1.3.0->-r requirements.txt (line 4)) (3.6.0) Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (3.20.0) Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (4.15.0) Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (75.2.0) Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (1.14.0) Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (3.6) Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (3.1.6) Requirement already satisfied: fsspec>=0.8.5 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (2025.3.0) Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.77) Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.77) Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.80) Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (9.10.2.21) Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.4.1) Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (11.3.0.4) Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (10.3.7.77) Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (11.7.1.2) Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.5.4.2) Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (0.7.1) Requirement already satisfied: nvidia-nccl-cu12==2.27.5 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (2.27.5) Requirement already satisfied: nvidia-nvshmem-cu12==3.3.20 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (3.3.20) Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.77) Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (12.6.85) Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (1.11.1.6) Requirement already satisfied: triton==3.5.0 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->-r requirements.txt (line 8)) (3.5.0) Requirement already satisfied: huggingface-hub>=0.24 in /usr/local/lib/python3.12/dist-packages (from segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (0.36.0) Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.12/dist-packages (from segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (0.7.0) Requirement already satisfied: timm>=0.9 in /usr/local/lib/python3.12/dist-packages (from segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (1.0.22) Requirement already satisfied: PyYAML in /usr/local/lib/python3.12/dist-packages (from albumentations>=1.3.1->-r requirements.txt (line 11)) (6.0.3) Requirement already satisfied: pydantic>=2.9.2 in /usr/local/lib/python3.12/dist-packages (from albumentations>=1.3.1->-r requirements.txt (line 11)) (2.12.3) Requirement already satisfied: albucore==0.0.24 in /usr/local/lib/python3.12/dist-packages (from albumentations>=1.3.1->-r requirements.txt (line 11)) (0.0.24) Requirement already satisfied: opencv-python-headless>=4.9.0.80 in /usr/local/lib/python3.12/dist-packages (from albumentations>=1.3.1->-r requirements.txt (line 11)) (4.12.0.88) Requirement already satisfied: stringzilla>=3.10.4 in /usr/local/lib/python3.12/dist-packages (from albucore==0.0.24->albumentations>=1.3.1->-r requirements.txt (line 11)) (4.2.3) Requirement already satisfied: simsimd>=5.9.2 in /usr/local/lib/python3.12/dist-packages (from albucore==0.0.24->albumentations>=1.3.1->-r requirements.txt (line 11)) (6.5.3) Requirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.12/dist-packages (from scikit-image>=0.21.0->-r requirements.txt (line 15)) (2.37.2) Requirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.12/dist-packages (from scikit-image>=0.21.0->-r requirements.txt (line 15)) (2025.10.16) Requirement already satisfied: packaging>=21 in /usr/local/lib/python3.12/dist-packages (from scikit-image>=0.21.0->-r requirements.txt (line 15)) (25.0) Requirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.12/dist-packages (from scikit-image>=0.21.0->-r requirements.txt (line 15)) (0.4) Requirement already satisfied: affine in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (2.4.0) Requirement already satisfied: attrs in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (25.4.0) Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (2025.11.12) Requirement already satisfied: click>=4.0 in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (8.3.1) Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (0.7.2) Requirement already satisfied: click-plugins in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (1.1.1.2) Requirement already satisfied: pyparsing in /usr/local/lib/python3.12/dist-packages (from rasterio>=1.3.8->-r requirements.txt (line 19)) (3.2.5) Requirement already satisfied: pyogrio>=0.7.2 in /usr/local/lib/python3.12/dist-packages (from geopandas>=0.13.0->-r requirements.txt (line 20)) (0.11.1) Requirement already satisfied: pyproj>=3.5.0 in /usr/local/lib/python3.12/dist-packages (from geopandas>=0.13.0->-r requirements.txt (line 20)) (3.7.2) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.7.0->-r requirements.txt (line 24)) (1.3.3) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.7.0->-r requirements.txt (line 24)) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.7.0->-r requirements.txt (line 24)) (4.60.1) Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.7.0->-r requirements.txt (line 24)) (1.4.9) Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (1.4.0) Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (1.76.0) Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (3.10) Requirement already satisfied: protobuf!=4.24.0,>=3.19.6 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (5.29.5) Requirement already satisfied: six>1.9 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (1.17.0) Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (0.7.2) Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from tensorboard>=2.13.0->-r requirements.txt (line 31)) (3.1.3) Requirement already satisfied: notebook in /usr/local/lib/python3.12/dist-packages (from jupyter>=1.0.0->-r requirements.txt (line 34)) (7.5.0) Requirement already satisfied: jupyter-console in /usr/local/lib/python3.12/dist-packages (from jupyter>=1.0.0->-r requirements.txt (line 34)) (6.6.3) Requirement already satisfied: nbconvert in /usr/local/lib/python3.12/dist-packages (from jupyter>=1.0.0->-r requirements.txt (line 34)) (7.16.6) Requirement already satisfied: jupyterlab in /usr/local/lib/python3.12/dist-packages (from jupyter>=1.0.0->-r requirements.txt (line 34)) (4.5.0) Requirement already satisfied: comm>=0.1.1 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.2.3) Requirement already satisfied: debugpy>=1.6.5 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (1.8.15) Requirement already satisfied: ipython>=7.23.1 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (7.34.0) Requirement already satisfied: jupyter-client>=8.0.0 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (8.6.3) Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (5.9.1) Requirement already satisfied: matplotlib-inline>=0.1 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.2.1) Requirement already satisfied: nest-asyncio>=1.4 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (1.6.0) Requirement already satisfied: psutil>=5.7 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (5.9.5) Requirement already satisfied: pyzmq>=25 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (26.2.1) Requirement already satisfied: tornado>=6.2 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (6.5.1) Requirement already satisfied: traitlets>=5.4.0 in /usr/local/lib/python3.12/dist-packages (from ipykernel>=6.25.0->-r requirements.txt (line 35)) (5.7.1) Requirement already satisfied: widgetsnbextension~=4.0.14 in /usr/local/lib/python3.12/dist-packages (from ipywidgets>=8.1.0->-r requirements.txt (line 36)) (4.0.15) Requirement already satisfied: jupyterlab_widgets~=3.0.15 in /usr/local/lib/python3.12/dist-packages (from ipywidgets>=8.1.0->-r requirements.txt (line 36)) (3.0.16) Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24->segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (2.32.4) Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub>=0.24->segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (1.2.0) Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.19.2) Requirement already satisfied: decorator in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (4.4.2) Requirement already satisfied: pickleshare in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.7.5) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (3.0.52) Requirement already satisfied: pygments in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (2.19.2) Requirement already satisfied: backcall in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.2.0) Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.12/dist-packages (from ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (4.9.0) Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.12/dist-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=6.25.0->-r requirements.txt (line 35)) (4.5.0) Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic>=2.9.2->albumentations>=1.3.1->-r requirements.txt (line 11)) (0.7.0) Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic>=2.9.2->albumentations>=1.3.1->-r requirements.txt (line 11)) (2.41.4) Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic>=2.9.2->albumentations>=1.3.1->-r requirements.txt (line 11)) (0.4.2) Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch>=2.0.0->-r requirements.txt (line 8)) (1.3.0) Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.12/dist-packages (from werkzeug>=1.0.1->tensorboard>=2.13.0->-r requirements.txt (line 31)) (3.0.3) Requirement already satisfied: async-lru>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.0.5) Requirement already satisfied: httpx<1,>=0.25.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.28.1) Requirement already satisfied: jupyter-lsp>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.3.0) Requirement already satisfied: jupyter-server<3,>=2.4.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.14.0) Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.28.0) Requirement already satisfied: notebook-shim>=0.2 in /usr/local/lib/python3.12/dist-packages (from jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.2.4) Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (4.13.5) Requirement already satisfied: bleach!=5.0.0 in /usr/local/lib/python3.12/dist-packages (from bleach[css]!=5.0.0->nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (6.3.0) Requirement already satisfied: defusedxml in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.7.1) Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.3.0) Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (3.1.4) Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.10.2) Requirement already satisfied: nbformat>=5.7 in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (5.10.4) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.12/dist-packages (from nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.5.1) Requirement already satisfied: webencodings in /usr/local/lib/python3.12/dist-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.5.1) Requirement already satisfied: tinycss2<1.5,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from bleach[css]!=5.0.0->nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.4.0) Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (4.11.0) Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.0.9) Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (3.11) Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.16.0) Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.12/dist-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.8.5) Requirement already satisfied: argon2-cffi>=21.1 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (25.1.0) Requirement already satisfied: jupyter-events>=0.9.0 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.12.0) Requirement already satisfied: jupyter-server-terminals>=0.4.4 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.5.3) Requirement already satisfied: overrides>=5.0 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (7.7.0) Requirement already satisfied: prometheus-client>=0.9 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.23.1) Requirement already satisfied: send2trash>=1.8.2 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.8.3) Requirement already satisfied: terminado>=0.8.3 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.18.1) Requirement already satisfied: websocket-client>=1.7 in /usr/local/lib/python3.12/dist-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.9.0) Requirement already satisfied: babel>=2.10 in /usr/local/lib/python3.12/dist-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.17.0) Requirement already satisfied: json5>=0.9.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.12.1) Requirement already satisfied: jsonschema>=4.18.0 in /usr/local/lib/python3.12/dist-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (4.25.1) Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.12/dist-packages (from nbformat>=5.7->nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.21.2) Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.12/dist-packages (from pexpect>4.3->ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.7.0) Requirement already satisfied: wcwidth in /usr/local/lib/python3.12/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.23.1->ipykernel>=6.25.0->-r requirements.txt (line 35)) (0.2.14) Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.24->segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (3.4.4) Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub>=0.24->segmentation-models-pytorch>=0.3.3->-r requirements.txt (line 10)) (2.5.0) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4->nbconvert->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.8) Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.12/dist-packages (from anyio->httpx<1,>=0.25.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.3.1) Requirement already satisfied: argon2-cffi-bindings in /usr/local/lib/python3.12/dist-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (25.1.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2025.9.1) Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.37.0) Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.29.0) Requirement already satisfied: python-json-logger>=2.0.4 in /usr/local/lib/python3.12/dist-packages (from jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (4.0.0) Requirement already satisfied: rfc3339-validator in /usr/local/lib/python3.12/dist-packages (from jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.1.4) Requirement already satisfied: rfc3986-validator>=0.1.1 in /usr/local/lib/python3.12/dist-packages (from jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (0.1.1) Requirement already satisfied: fqdn in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.5.1) Requirement already satisfied: isoduration in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (20.11.0) Requirement already satisfied: jsonpointer>1.13 in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (3.0.0) Requirement already satisfied: rfc3987-syntax>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.1.0) Requirement already satisfied: uri-template in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.3.0) Requirement already satisfied: webcolors>=24.6.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (25.10.0) Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.0.0) Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (2.23) Requirement already satisfied: lark>=1.2.2 in /usr/local/lib/python3.12/dist-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.3.1) Requirement already satisfied: arrow>=0.15.0 in /usr/local/lib/python3.12/dist-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter>=1.0.0->-r requirements.txt (line 34)) (1.4.0) dataset/ notebooks/ requirements.txt src/
In [3]:
import platform
import psutil
import subprocess
import os
if IS_COLAB:
print("Google Colab Environment Specifications:")
print("="*50)
# Get system info
print(f"Operating System: {platform.system()} {platform.release()}")
print(f"Architecture: {platform.machine()}")
print(f"Python Version: {platform.python_version()}")
# Memory info
memory = psutil.virtual_memory()
print(f"Total RAM: {memory.total / (1024**3):.1f} GB")
print(f"Available RAM: {memory.available / (1024**3):.1f} GB")
# CPU info
print(f"CPU Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
# GPU info
try:
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
capture_output=True, text=True)
if result.returncode == 0:
gpu_info = result.stdout.strip().split('\n')
for i, gpu in enumerate(gpu_info):
name, memory = gpu.split(', ')
print(f"GPU {i}: {name}, {memory} MB VRAM")
else:
print("GPU: Not detected or nvidia-smi unavailable")
except:
print("GPU: Not detected")
# Disk space
disk = psutil.disk_usage('/')
print(f"Disk Space: {disk.free / (1024**3):.1f} GB free / {disk.total / (1024**3):.1f} GB total")
print("="*50)
if not os.path.exists('/content/aai521_3proj'):
print("WARNING: Cloning project repository required.")
print("="*50)
else:
print("Not running in Google Colab environment")
Google Colab Environment Specifications: ================================================== Operating System: Linux 6.6.105+ Architecture: x86_64 Python Version: 3.12.12 Total RAM: 83.5 GB Available RAM: 80.3 GB CPU Cores: 6 physical, 12 logical GPU 0: NVIDIA A100-SXM4-40GB, 40960 MB VRAM Disk Space: 91.4 GB free / 235.7 GB total ==================================================
1. Setup & Imports¶
In [4]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Add src to path
sys.path.append('../src')
# Reload modules to pick up latest changes
import importlib
if 'dataset' in sys.modules:
importlib.reload(sys.modules['dataset'])
if 'models' in sys.modules:
importlib.reload(sys.modules['models'])
if 'config' in sys.modules:
importlib.reload(sys.modules['config'])
# Import custom modules
import config
from dataset import create_dataloaders, FloodDataset
from models import create_model, UNetPlusPlus, DeepLabV3Plus, SegFormer
from losses import create_loss_function
from metrics import MetricsTracker, SegmentationMetrics
from trainer import Trainer
from experiment_tracking import ExperimentLogger, ExperimentComparator
from gpu_manager import GPUManager
# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Initialize GPU manager
gpu_mgr = GPUManager()
gpu_mgr.setup()
gpu_mgr.print_nvidia_smi_info()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {gpu_mgr.is_available()}")
if gpu_mgr.is_available():
print(f"CUDA device: {gpu_mgr.gpu_name}")
print(f"CUDA memory: {gpu_mgr.total_memory_gb:.2f} GB")
GPU 0: NVIDIA A100-SXM4-40GB, 40960 MB VRAM PyTorch version: 2.9.0+cu126 CUDA available: True CUDA device: NVIDIA A100-SXM4-40GB CUDA memory: 42.47 GB
2. Data Loading & Exploration¶
In [5]:
# Print GPU information and clear up
gpu_mgr.print_info()
gpu_mgr.print_memory_stats()
gpu_mgr.cleanup()
CUDA available: True CUDA device: NVIDIA A100-SXM4-40GB CUDA memory: 42.47 GB GPU Memory Usage: Allocated: 0.00 GB, Reserved: 0.00 GB, Free: 42.47 GB, Total: 42.47 GB
Visualize Class Distribution¶
In [6]:
# Create dataloaders
print("Creating dataloaders...")
train_loader, val_loader, test_loader = create_dataloaders(
train_dir=config.PROCESSED_TRAIN_DIR,
val_dir=config.PROCESSED_VAL_DIR,
test_dir=config.PROCESSED_TEST_DIR,
batch_size=8,
num_workers=0, # Set to 0 for Colab to avoid worker crashes
pin_memory=False # Disabled for Colab stability
)
# Calculate class weights from training data
print("\nCalculating class weights from training data...")
class_counts = torch.zeros(config.NUM_CLASSES)
for batch in tqdm(train_loader, desc="Computing class distribution"):
masks = batch['mask']
for cls in range(config.NUM_CLASSES):
class_counts[cls] += (masks == cls).sum()
# Check for missing classes
print("\nRaw class counts:")
for cls, (name, count) in enumerate(zip(config.CLASS_NAMES, class_counts)):
print(f" {name}: {count:.0f} pixels")
# Compute weights (inverse frequency) with protection against division by zero
total_pixels = class_counts.sum()
class_weights = torch.zeros(config.NUM_CLASSES)
for cls in range(config.NUM_CLASSES):
if class_counts[cls] > 0:
class_weights[cls] = total_pixels / (config.NUM_CLASSES * class_counts[cls])
else:
# If class is missing, assign median weight to avoid NaN
print(f"WARNING: Class {cls} ({config.CLASS_NAMES[cls]}) has 0 samples!")
class_weights[cls] = 1.0 # Will be normalized later
# Normalize weights
weight_sum = class_weights.sum()
if weight_sum > 0:
class_weights = class_weights / weight_sum * config.NUM_CLASSES
else:
# Fallback to uniform weights if all classes missing (shouldn't happen)
class_weights = torch.ones(config.NUM_CLASSES)
# Cap extreme weights to prevent numerical instability
max_weight = 10.0
class_weights = torch.clamp(class_weights, min=0.1, max=max_weight)
print("\nClass distribution and weights (after capping):")
for cls, (name, count, weight) in enumerate(zip(config.CLASS_NAMES, class_counts, class_weights)):
pct = (count / total_pixels * 100).item() if total_pixels > 0 else 0.0
print(f" {name}: {count:.0f} pixels ({pct:.2f}%), weight: {weight:.4f}")
print(f"\nDataloaders created:")
print(f" Train: {len(train_loader)} batches ({len(train_loader.dataset)} samples)")
print(f" Val: {len(val_loader)} batches ({len(val_loader.dataset)} samples)")
print(f" Test: {len(test_loader)} batches ({len(test_loader.dataset)} samples)")
Creating dataloaders... Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 910 batches (7285 samples) Val: 136 batches (1087 samples) Test: 447 batches (3573 samples) Calculating class weights from training data...
Computing class distribution: 0%| | 0/910 [00:00<?, ?it/s]
Raw class counts: 0: 1862348544 pixels 1: 30064482 pixels 2: 8342595 pixels 3: 0 pixels 4: 0 pixels 5: 1743579 pixels 6: 5909875 pixels WARNING: Class 3 (major-damage) has 0 samples! WARNING: Class 4 (destroyed) has 0 samples! Class distribution and weights (after capping): 0: 1862348544 pixels (97.59%), weight: 0.1000 1: 30064482 pixels (1.58%), weight: 0.2576 2: 8342595 pixels (0.44%), weight: 0.9284 3: 0 pixels (0.00%), weight: 0.1000 4: 0 pixels (0.00%), weight: 0.1000 5: 1743579 pixels (0.09%), weight: 4.4423 6: 5909875 pixels (0.31%), weight: 1.3106 Dataloaders created: Train: 910 batches (7285 samples) Val: 136 batches (1087 samples) Test: 447 batches (3573 samples)
Visualize Sample Data¶
In [7]:
# Get a batch of training data
train_iter = iter(train_loader)
batch = next(train_iter)
images = batch['image']
masks = batch['mask']
print(f"Batch shape: {images.shape}")
print(f"Mask shape: {masks.shape}")
print(f"Image range: [{images.min():.3f}, {images.max():.3f}]")
print(f"Mask classes: {masks.unique().tolist()}")
# Visualize samples
def visualize_samples(images, masks, num_samples=3):
"""Visualize pre/post images and masks."""
fig, axes = plt.subplots(num_samples, 3, figsize=(15, 5*num_samples))
# Color map for masks
cmap = plt.cm.get_cmap('tab10', len(config.CLASS_NAMES))
for i in range(num_samples):
# Pre-event image (first 3 channels)
pre_img = images[i, :3].permute(1, 2, 0).numpy()
pre_img = (pre_img - pre_img.min()) / (pre_img.max() - pre_img.min() + 1e-8)
# Post-event image (last 3 channels)
post_img = images[i, 3:].permute(1, 2, 0).numpy()
post_img = (post_img - post_img.min()) / (post_img.max() - post_img.min() + 1e-8)
# Mask
mask = masks[i].numpy()
# Plot pre-event
axes[i, 0].imshow(pre_img)
axes[i, 0].set_title('Pre-Event Image', fontsize=12, fontweight='bold')
axes[i, 0].axis('off')
# Plot post-event
axes[i, 1].imshow(post_img)
axes[i, 1].set_title('Post-Event Image', fontsize=12, fontweight='bold')
axes[i, 1].axis('off')
# Plot mask
mask_plot = axes[i, 2].imshow(mask, cmap=cmap, vmin=0, vmax=len(config.CLASS_NAMES)-1)
axes[i, 2].set_title('Ground Truth Mask', fontsize=12, fontweight='bold')
axes[i, 2].axis('off')
# Add colorbar to last mask
if i == num_samples - 1:
cbar = plt.colorbar(mask_plot, ax=axes[i, 2], orientation='horizontal',
pad=0.05, fraction=0.046)
cbar.set_ticks(range(len(config.CLASS_NAMES)))
cbar.set_ticklabels(config.CLASS_NAMES, rotation=45, ha='right', fontsize=8)
plt.tight_layout()
plt.show()
visualize_samples(images, masks, num_samples=3)
Batch shape: torch.Size([8, 6, 512, 512]) Mask shape: torch.Size([8, 512, 512]) Image range: [-2.118, -2.018] Mask classes: [0, 1, 5, 6]
3. Model Architecture Overview¶
In [8]:
# Create models for architecture overview
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# List of all models to train
ALL_MODELS = ['unet++', 'deeplabv3+', 'segformer', 'fc_siam_diff', 'siamese_unet++', 'stanet']
models_info = []
for model_name in ALL_MODELS:
model = create_model(
model_name=model_name,
in_channels=6 if 'siamese' not in model_name.lower() else 3,
num_classes=config.NUM_CLASSES,
**config.MODEL_CONFIGS.get(model_name, {})
)
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
models_info.append({
'Model': model_name.upper(),
'Total Parameters': f"{total_params:,}",
'Trainable Parameters': f"{trainable_params:,}",
'Size (MB)': f"{total_params * 4 / 1e6:.2f}"
})
del model
# Display as table
models_df = pd.DataFrame(models_info)
print("\n" + "="*80)
print("MODEL ARCHITECTURE COMPARISON")
print("="*80)
print(models_df.to_string(index=False))
print("="*80)
config.json: 0%| | 0.00/156 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/87.3M [00:00<?, ?B/s]
UNet++ initialized: Encoder: resnet34 Input channels: 6 Output classes: 7 Pretrained: imagenet Deep supervision: False
config.json: 0%| | 0.00/156 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/102M [00:00<?, ?B/s]
DeepLabV3+ initialized: Encoder: resnet50 Input channels: 6 Output classes: 7 Output stride: 16 Pretrained: imagenet
config.json: 0.00B [00:00, ?B/s]
model.safetensors: 0%| | 0.00/15.0M [00:00<?, ?B/s]
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match: - decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([7]) in the model instantiated - decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([7, 256, 1, 1]) in the model instantiated You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
SegFormer initialized:
Model: nvidia/segformer-b0-finetuned-ade-512-512
Input channels: 6
Output classes: 7
Pretrained: True
FC-Siam-Diff initialized:
Encoder: resnet34
Optimized for change detection
Siamese U-Net++ initialized:
Encoder: resnet34
Fusion: concat
Output classes: 7
STANet initialized with spatial-temporal attention
================================================================================
MODEL ARCHITECTURE COMPARISON
================================================================================
Model Total Parameters Trainable Parameters Size (MB)
UNET++ 26,088,887 26,088,887 104.36
DEEPLABV3+ 26,688,535 26,688,535 106.75
SEGFORMER 3,720,647 3,720,647 14.88
FC_SIAM_DIFF 24,584,519 24,584,519 98.34
SIAMESE_UNET++ 26,780,167 26,780,167 107.12
STANET 24,890,824 24,890,824 99.56
================================================================================
4. Training Configuration¶
In [9]:
# Setup device using GPU manager
device = gpu_mgr.get_device()
print(f"Using device: {device}")
# Training configurations
LIGHT_CONFIG = {
'num_epochs': 3,
'batch_size': 8,
'learning_rate': 1e-4,
'weight_decay': 1e-4,
'device': device,
'use_amp': True,
'gradient_clip': 1.0,
'max_batches_per_epoch': 50,
'loss_type': 'combined',
'early_stopping_patience': 5,
}
TRAINING_CONFIG = {
'num_epochs': 20,
'batch_size': 64, # Increased from 32 for A100 (40GB VRAM) - maximize GPU utilization!
'learning_rate': 1e-4, # Increased from 1e-5 with larger batch size
'weight_decay': 1e-4,
'device': device,
'use_amp': True,
'gradient_clip': 1.0,
'gradient_accumulation_steps': 1, # Reduced from 2 since batch size increased to 64
'loss_type': 'dice', # Simplified from 'combined' to most stable loss
'early_stopping_patience': 5,
# Reduced complexity for combined loss (if switched back)
'ce_weight': 0.1,
'dice_weight': 1.0, # Reduced from 2.0
'focal_weight': 1.0, # Reduced from 3.0
'focal_gamma': 2.0, # Reduced from 3.0
}
print("Configuration loaded:")
print(f" Light validation: {LIGHT_CONFIG['num_epochs']} epochs, {LIGHT_CONFIG['max_batches_per_epoch']} batches/epoch")
print(f" Full training: {TRAINING_CONFIG['num_epochs']} epochs, early stop patience={TRAINING_CONFIG['early_stopping_patience']}")
Using device: cuda Configuration loaded: Light validation: 3 epochs, 50 batches/epoch Full training: 20 epochs, early stop patience=5
In [10]:
# Check GPU memory and recommend optimal batch size
if gpu_mgr.is_available():
recommended_batch = gpu_mgr.recommend_batch_size()
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_num_threads(8)
print(f"GPU: {gpu_mgr.gpu_name}")
print(f"Total Memory: {gpu_mgr.total_memory_gb:.2f} GB")
print(f"Recommended batch size: {recommended_batch}")
print(f"Current batch size: {TRAINING_CONFIG['batch_size']}")
print(f"Effective batch size: {TRAINING_CONFIG['batch_size'] * TRAINING_CONFIG['gradient_accumulation_steps']}")
# Monitor current GPU state
gpu_mgr.cleanup()
stats = gpu_mgr.get_memory_stats()
print(f"\nCurrent GPU Usage: Allocated: {stats['allocated_gb']:.2f} GB, Reserved: {stats['reserved_gb']:.2f} GB, Free: {stats['free_gb']:.2f} GB")
else:
print("No GPU available - using CPU")
GPU: NVIDIA A100-SXM4-40GB Total Memory: 42.47 GB Recommended batch size: 64 Current batch size: 64 Effective batch size: 64 Current GPU Usage: Allocated: 0.00 GB, Reserved: 0.00 GB, Free: 42.47 GB
5. Light Pipeline Validation¶
Quickly validate that all models can train without errors before committing to full training.
In [11]:
# Import light pipeline
from light_pipeline import LightPipeline
print("Light pipeline class loaded.")
# Run light validation pipeline for all models
light_pipeline = LightPipeline(LIGHT_CONFIG, class_weights)
# Validate all models
validation_results = light_pipeline.validate_all_models(
ALL_MODELS,
train_loader,
val_loader
)
# Access results
passed_models = light_pipeline.get_passed_models()
failed_models = light_pipeline.get_failed_models()
print(f"\nReady to proceed with {len(passed_models)} validated models.")
Light pipeline class loaded. ================================================================================ LIGHT PIPELINE VALIDATION - Testing all models can train ================================================================================ Config: 3 epochs, 50 batches/epoch Expected time: ~2-3 minutes per model, ~15 minutes total ============================================================ Validating UNET++ ============================================================ UNet++ initialized: Encoder: resnet34 Input channels: 6 Output classes: 7 Pretrained: imagenet Deep supervision: False Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.5063 Epoch 2/3: Loss=2.3338 Epoch 3/3: Loss=2.2616 [PASSED] (111.2s) - Final Loss: 2.2616 ============================================================ Validating DEEPLABV3+ ============================================================ DeepLabV3+ initialized: Encoder: resnet50 Input channels: 6 Output classes: 7 Output stride: 16 Pretrained: imagenet Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.2341 Epoch 2/3: Loss=1.9475 Epoch 3/3: Loss=1.8874 [PASSED] (70.2s) - Final Loss: 1.8874 ============================================================ Validating SEGFORMER ============================================================
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match: - decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([7]) in the model instantiated - decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([7, 256, 1, 1]) in the model instantiated You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
SegFormer initialized: Model: nvidia/segformer-b0-finetuned-ade-512-512 Input channels: 6 Output classes: 7 Pretrained: True Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.3517 Epoch 2/3: Loss=2.1866 Epoch 3/3: Loss=2.0692 [PASSED] (72.0s) - Final Loss: 2.0692 ============================================================ Validating FC_SIAM_DIFF ============================================================ FC-Siam-Diff initialized: Encoder: resnet34 Optimized for change detection Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.6563 Epoch 2/3: Loss=2.5765 Epoch 3/3: Loss=2.5387 [PASSED] (85.4s) - Final Loss: 2.5387 ============================================================ Validating SIAMESE_UNET++ ============================================================ Siamese U-Net++ initialized: Encoder: resnet34 Fusion: concat Output classes: 7 Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.4739 Epoch 2/3: Loss=2.4353 Epoch 3/3: Loss=2.3867 [PASSED] (55.5s) - Final Loss: 2.3867 ============================================================ Validating STANET ============================================================ STANet initialized with spatial-temporal attention Combined Loss initialized: CE weight: 0.1 Dice weight: 2.0 Focal weight: 3.0 Class weights: [0.1 0.25763178 0.9284361 0.1 0.1 4.4423375 1.3106143 ] Epoch 1/3: Loss=2.5727 Epoch 2/3: Loss=2.5318 Epoch 3/3: Loss=2.4807 [PASSED] (84.4s) - Final Loss: 2.4807 ================================================================================ VALIDATION SUMMARY ================================================================================ Passed: 6/6 [PASS] UNET++ (111.2s, Loss: 2.2616) [PASS] DEEPLABV3+ (70.2s, Loss: 1.8874) [PASS] SEGFORMER (72.0s, Loss: 2.0692) [PASS] FC_SIAM_DIFF (85.4s, Loss: 2.5387) [PASS] SIAMESE_UNET++ (55.5s, Loss: 2.3867) [PASS] STANET (84.4s, Loss: 2.4807) All models validated successfully! Ready for full training. ================================================================================ Ready to proceed with 6 validated models.
Validation Performance Comparison¶
In [12]:
# Import visualization utilities
from visualizations import ValidationVisualizer
# Create visualizer instance
viz = ValidationVisualizer()
# Visualize validation results with comprehensive analysis
if validation_results:
# Figure 1: Training Speed & Success Rate Overview
viz.plot_validation_overview(validation_results, ALL_MODELS)
# Figure 2: Learning Progress & Convergence Analysis
viz.plot_learning_analysis(validation_results, ALL_MODELS, len(train_loader))
# Figure 3: Top Performers Podium
viz.plot_top_performers(validation_results, ALL_MODELS)
# Print detailed statistics
viz.print_validation_statistics(validation_results, ALL_MODELS, len(train_loader))
else:
print("No validation results available for visualization.")
========================================================================================== VALIDATION PERFORMANCE ANALYSIS ========================================================================================== Passed Models Statistics (6 models): • Average time: 79.8s • Fastest: SIAMESE_UNET++ (55.5s) • Slowest: UNET++ (111.2s) • Time range: 55.5s - 111.2s • Std deviation: ±17.2s • Speed variation: 100.2% difference Estimated Full Training Time (20 epochs, full dataset): • Per model (avg): ~2.7 hours • All 6 models: ~16.1 hours • Sequential training: ~16.1 hours • Parallel (3 pairs): ~5.4 hours Efficiency Insights: • SIAMESE_UNET++ is 2.0x faster than UNET++ • Time saved by choosing fastest: 55.6s per validation Top Recommendations: Rank 1: SIAMESE_UNET++ - 55.5s (Efficiency: 50%) Rank 2: DEEPLABV3+ - 70.2s (Efficiency: 37%) Rank 3: SEGFORMER - 72.0s (Efficiency: 35%) ==========================================================================================
6. Training Function¶
In [13]:
def train_model(model_name, config_dict, train_loader, val_loader, class_weights, resume_from_checkpoint=None):
"""Train a single model and return training history.
Args:
model_name: Name of the model to train
config_dict: Training configuration dictionary
train_loader: Training data loader
val_loader: Validation data loader
class_weights: Class weights for loss function
resume_from_checkpoint: Path to checkpoint file to resume training from
"""
print(f"\n{'='*80}")
print(f"Training {model_name.upper()}")
print(f"{'='*80}\n")
# CUDA optimizations
if torch.cuda.is_available():
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Create output directory or use existing for resume
if resume_from_checkpoint:
# Extract output dir from checkpoint path
checkpoint_path = Path(resume_from_checkpoint)
output_dir = checkpoint_path.parent.parent
checkpoint_dir = checkpoint_path.parent
print(f"Resuming from checkpoint: {checkpoint_path}")
else:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = Path('../outputs/training') / f'{model_name}_{timestamp}'
output_dir.mkdir(parents=True, exist_ok=True)
checkpoint_dir = output_dir / 'checkpoints'
checkpoint_dir.mkdir(exist_ok=True)
# Clear GPU memory before creating model
if config_dict['device'] == 'cuda':
try:
torch.cuda.empty_cache()
import gc
gc.collect()
except:
pass
# Create model
model = create_model(
model_name=model_name,
in_channels=6 if 'siamese' not in model_name.lower() else 3,
num_classes=config.NUM_CLASSES,
**config.MODEL_CONFIGS.get(model_name, {})
)
# Move model to device with proper handling for meta tensors
if config_dict['device'] == 'cuda':
# First move to CPU if needed, then to CUDA to avoid meta tensor issues
model = model.cpu()
# Clear GPU cache again before moving to CUDA
try:
torch.cuda.empty_cache()
gc.collect()
except:
pass
model = model.to(config_dict['device'])
else:
model = model.to(config_dict['device'])
# Use torch.compile if available (disabled for now to avoid issues)
# if hasattr(torch, 'compile') and config_dict['device'] == 'cuda':
# try:
# model = torch.compile(model, mode='default')
# except Exception as e:
# pass
# Create loss function
loss_fn = create_loss_function(
loss_type=config_dict['loss_type'],
num_classes=config.NUM_CLASSES,
class_weights=class_weights.to(config_dict['device']),
device=config_dict['device'],
ce_weight=config_dict.get('ce_weight', 0.1),
dice_weight=config_dict.get('dice_weight', 2.0),
focal_weight=config_dict.get('focal_weight', 3.0),
focal_gamma=config_dict.get('focal_gamma', 3.0)
)
# Create optimizer
optimizer = optim.AdamW(
model.parameters(),
lr=config_dict['learning_rate'],
weight_decay=config_dict['weight_decay']
)
# Create learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
optimizer,
mode='max',
factor=0.5,
patience=3
)
# Create experiment logger
from experiment_tracking import ExperimentLogger
log_dir = Path('../outputs/tensorboard')
log_dir.mkdir(parents=True, exist_ok=True)
logger = ExperimentLogger(
log_dir=log_dir,
experiment_name=f'{model_name}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
)
# Load checkpoint if resuming
start_epoch = 0
best_val_iou = 0.0
if resume_from_checkpoint:
checkpoint = torch.load(resume_from_checkpoint)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
if 'scheduler_state_dict' in checkpoint:
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint.get('epoch', 0) + 1
best_val_iou = checkpoint.get('best_val_iou', 0.0)
print(f"Resuming from epoch {start_epoch}, best IoU: {best_val_iou:.4f}")
# Create trainer
experiment_name = f"{model_name}_{output_dir.name.split('_', 1)[1]}" if resume_from_checkpoint else f'{model_name}_{timestamp}'
trainer = Trainer(
model=model,
train_loader=train_loader,
val_loader=val_loader,
optimizer=optimizer,
scheduler=scheduler,
loss_fn=loss_fn,
num_classes=config.NUM_CLASSES,
device=config_dict['device'],
checkpoint_dir=checkpoint_dir,
experiment_name=experiment_name,
use_amp=config_dict['use_amp'],
gradient_clip_val=config_dict['gradient_clip'],
early_stopping_patience=config_dict['early_stopping_patience'],
gradient_accumulation_steps=config_dict.get('gradient_accumulation_steps', 1),
class_names=config.CLASS_NAMES
)
# Train (adjust epochs if resuming)
remaining_epochs = config_dict['num_epochs'] - start_epoch
if remaining_epochs > 0:
history = trainer.train(num_epochs=remaining_epochs)
else:
print(f"Training already completed ({start_epoch} epochs). Skipping.")
return None, output_dir
# Print final summary
best_epoch = max(range(len(history['val_iou'])), key=lambda i: history['val_iou'][i])
print(f"\n{'='*80}")
print(f"FINAL RESULTS - {model_name.upper()}")
print(f"{'='*80}")
print(f"Best epoch: {best_epoch + 1}/{len(history['val_iou'])}")
print(f"\nBest validation metrics:")
print(f" IoU: {history['val_iou'][best_epoch]:.4f}")
print(f" Dice: {history['val_dice'][best_epoch]:.4f}")
print(f" F1: {history['val_f1'][best_epoch]:.4f}")
print(f"\n{'='*80}")
# Per-class metrics if available
if 'val_iou_per_class' in history:
print(f"\nPer-class IoU (Best Epoch):")
for i, (class_name, iou) in enumerate(zip(config.CLASS_NAMES, history['val_iou_per_class'][best_epoch])):
print(f" {class_name}: {iou:.4f}")
print(f"{'='*80}\n")
# Log metrics to TensorBoard
for epoch in range(len(history['train_loss'])):
logger.log_scalar('Loss/train', history['train_loss'][epoch], epoch)
logger.log_scalar('Loss/val', history['val_loss'][epoch], epoch)
logger.log_scalar('IoU/train', history['train_iou'][epoch], epoch)
logger.log_scalar('IoU/val', history['val_iou'][epoch], epoch)
logger.close()
# Save history
history_json = {}
for key, values in history.items():
if isinstance(values, list):
history_json[key] = [float(v) if hasattr(v, 'item') else v for v in values]
else:
history_json[key] = values
with open(output_dir / 'training_history.json', 'w') as f:
json.dump(history_json, f, indent=2)
print(f"[SAVED] Checkpoints: {checkpoint_dir}")
print(f"[SAVED] Training history: {output_dir / 'training_history.json'}\n")
return history, output_dir
6. Train All Models¶
In [14]:
# Clear GPU cache before training
gpu_mgr.cleanup()
import gc
gc.collect()
if gpu_mgr.is_available():
stats = gpu_mgr.get_memory_stats()
total_memory_gb = stats['total_gb']
allocated_gb = stats['allocated_gb']
reserved_gb = stats['reserved_gb']
free_gb = stats['free_gb']
print(f"GPU Memory Status:")
print(f" Total: {total_memory_gb:.2f} GB")
print(f" Allocated: {allocated_gb:.2f} GB")
print(f" Reserved: {reserved_gb:.2f} GB")
print(f" Available: {free_gb:.2f} GB")
if free_gb < 2.0:
print("\nWARNING: Less than 2GB free GPU memory!")
print("Recommendation: Restart runtime to clear GPU memory completely.")
else:
print(f"\nGPU memory check passed. Ready for training.")
else:
print("No GPU available - training will be slow on CPU")
# Reload trainer module
import importlib
if 'trainer' in sys.modules:
importlib.reload(sys.modules['trainer'])
print("Trainer module reloaded")
else:
from trainer import Trainer
print("Trainer module loaded")
GPU Memory Status: Total: 42.47 GB Allocated: 0.02 GB Reserved: 0.04 GB Available: 42.46 GB GPU memory check passed. Ready for training. Trainer module reloaded
In [15]:
import threading
from queue import Queue
import copy
import gc
# Global variable to track currently training models
current_training_models = []
training_lock = threading.Lock()
def train_model_parallel(model_name, config_dict, train_loader, val_loader, class_weights, results_queue):
"""Train a single model in parallel mode."""
try:
# Register this model as currently training
with training_lock:
current_training_models.append(model_name)
print(f"\n{'='*80}")
print(f"NOW TRAINING: {model_name.upper()} (Parallel Mode)")
print(f"{'='*80}\n")
# Create separate config for this model
model_config = copy.deepcopy(config_dict)
model_config['batch_size'] = 4 # Reduced to 4 for parallel training
# Clear GPU cache before training
gpu_mgr.cleanup()
# Create fresh dataloaders for parallel training
parallel_train_loader, parallel_val_loader, _ = create_dataloaders(
train_dir=config.PROCESSED_TRAIN_DIR,
val_dir=config.PROCESSED_VAL_DIR,
test_dir=config.PROCESSED_TEST_DIR,
batch_size=model_config['batch_size'],
num_workers=1, # Reduced to 1 for parallel training
pin_memory=False # Disabled to save GPU memory
)
# Train model with fresh dataloaders
history, output_dir = train_model(model_name, model_config, parallel_train_loader, parallel_val_loader, class_weights)
# Store results
results_queue.put({
'model_name': model_name,
'history': history,
'output_dir': output_dir,
'success': True,
'error': None
})
except Exception as e:
import traceback
error_msg = f"{str(e)}\n{traceback.format_exc()}"
results_queue.put({
'model_name': model_name,
'history': None,
'output_dir': None,
'success': False,
'error': error_msg
})
finally:
# Unregister this model
with training_lock:
if model_name in current_training_models:
current_training_models.remove(model_name)
# Clear GPU cache after training
gpu_mgr.cleanup()
def train_models_parallel_pairs(model_pairs, config_dict, train_loader, val_loader, class_weights):
"""Train models in parallel pairs."""
all_results = {}
for pair_idx, pair in enumerate(model_pairs):
print(f"\n{'='*80}")
print(f"PARALLEL TRAINING - PAIR {pair_idx + 1}: {' + '.join([m.upper() for m in pair])}")
print(f"{'='*80}\n")
# Clear GPU cache before starting pair
gpu_mgr.cleanup()
# Create results queue
results_queue = Queue()
# Create threads for parallel training
threads = []
for model_name in pair:
thread = threading.Thread(
target=train_model_parallel,
args=(model_name, config_dict, train_loader, val_loader, class_weights, results_queue)
)
threads.append(thread)
thread.start()
# Wait for both to complete
for thread in threads:
thread.join()
# Collect results
while not results_queue.empty():
result = results_queue.get()
model_name = result['model_name']
if result['success']:
all_results[model_name] = {
'history': result['history'],
'output_dir': result['output_dir']
}
print(f"\n{model_name.upper()} completed successfully!")
else:
print(f"\n{model_name.upper()} failed:")
print(f"Error: {result['error']}\n")
# Clear GPU cache between pairs
gpu_mgr.cleanup()
print(f"\n{'='*80}")
print(f"PAIR {pair_idx + 1} COMPLETED")
print(f"{'='*80}\n")
print("Parallel training functions loaded!")
return all_results
In [16]:
# GPU monitoring using GPUManager
print("GPU monitoring available via GPUManager.")
print("Run gpu_mgr.monitor_memory(interval=30, duration=3600) to start monitoring.")
# Uncomment to run GPU monitor in background
# import threading
# monitor_thread = gpu_mgr.monitor_memory(interval=30, duration=3600)
# print("GPU monitor started in background (30s refresh, 1 hour duration).")
GPU monitoring available via GPUManager. Run gpu_mgr.monitor_memory(interval=30, duration=3600) to start monitoring.
7. Full Training Execution¶
IMPORTANT: Run the Light Pipeline Validation (Section 5) first before executing this section!
This section trains all models with the optimized configuration:
- 20 epochs (reduced from 30)
- Early stopping patience: 5 (reduced from 10)
- Sequential or parallel mode
In [17]:
# Training mode: Set to True for parallel, False for sequential
USE_PARALLEL_TRAINING = True # FALSE for maximum GPU utilization (sequential mode recommended)
# Set to checkpoint path to resume training, or None to start fresh
# Example: RESUME_CHECKPOINTS = {'deeplabv3+': '../outputs/training/deeplabv3+_20251202_123456/checkpoints/best_model.pth'}
RESUME_CHECKPOINTS = {} # Empty dict = start fresh training for all models
# Current strategy: Train the 2 models that consistently pass validation
ALL_MODELS = ['deeplabv3+'] # Models that passed latest validation
# ALL_MODELS = ['unet++', 'deeplabv3+', 'segformer', 'fc_siam_diff'] # Uncomment after freeing GPU memory
# ALL_MODELS = ['unet++', 'deeplabv3+', 'segformer', 'fc_siam_diff', 'siamese_unet++', 'stanet'] # All 6 models
# Model pairs for parallel training (only used if USE_PARALLEL_TRAINING=True)
MODEL_PAIRS = [
['unet++', 'deeplabv3+'], # Pair 1: Smaller models
['segformer', 'fc_siam_diff'], # Pair 2: Medium models
['siamese_unet++', 'stanet'] # Pair 3: Larger models
]
# Check validation status
if 'validation_results' in globals():
failed_models = [m for m, r in validation_results.items() if r['status'] == 'failed']
if failed_models:
print(f"\nWARNING: {len(failed_models)} model(s) failed validation!")
print("Failed models:", ', '.join([m.upper() for m in failed_models]))
print("Recommend fixing validation errors before full training.\n")
else:
print("\nWARNING: Light validation not run yet!")
print("Recommend running Section 5 (Light Pipeline Validation) first.\n")
# Execute training
if USE_PARALLEL_TRAINING:
print("\nPARALLEL TRAINING MODE")
print("Training 6 models in 3 pairs with batch_size=4 per model")
print("Total effective batch size: 8 (2 models × 4)\n")
results = train_models_parallel_pairs(
MODEL_PAIRS,
TRAINING_CONFIG,
train_loader,
val_loader,
class_weights
)
else:
print("\nSEQUENTIAL TRAINING MODE")
print(f"Training {len(ALL_MODELS)} models one by one with batch_size={TRAINING_CONFIG['batch_size']}\n")
results = {}
for model_name in ALL_MODELS:
# Print which model is being trained
print(f"\n{'='*80}")
print(f"NOW TRAINING: {model_name.upper()} (Sequential Mode)")
print(f"{'='*80}\n")
# Check if resume checkpoint exists for this model
resume_checkpoint = RESUME_CHECKPOINTS.get(model_name, None)
if resume_checkpoint:
print(f"\nResuming {model_name} from checkpoint...")
history, output_dir = train_model(
model_name,
TRAINING_CONFIG,
train_loader,
val_loader,
class_weights,
resume_from_checkpoint=resume_checkpoint
)
results[model_name] = {
'history': history,
'output_dir': output_dir
}
# Clear GPU cache between models
gpu_mgr.cleanup()
# Extract results (handle missing models gracefully)
unet_history = results.get('unet++', {}).get('history')
unet_output_dir = results.get('unet++', {}).get('output_dir')
deeplab_history = results.get('deeplabv3+', {}).get('history')
deeplab_output_dir = results.get('deeplabv3+', {}).get('output_dir')
segformer_history = results.get('segformer', {}).get('history')
segformer_output_dir = results.get('segformer', {}).get('output_dir')
fcsiamdiff_history = results.get('fc_siam_diff', {}).get('history')
fcsiamdiff_output_dir = results.get('fc_siam_diff', {}).get('output_dir')
siamese_unet_history = results.get('siamese_unet++', {}).get('history')
siamese_unet_output_dir = results.get('siamese_unet++', {}).get('output_dir')
stanet_history = results.get('stanet', {}).get('history')
stanet_output_dir = results.get('stanet', {}).get('output_dir')
# Training complete summary
print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)
print(f"Mode: {'PARALLEL' if USE_PARALLEL_TRAINING else 'SEQUENTIAL'}")
successful_models = []
failed_models = []
for model_name in ALL_MODELS:
if model_name in results and results[model_name].get('history') is not None:
successful_models.append(model_name)
else:
failed_models.append(model_name)
if successful_models:
print(f"\nSuccessfully trained models ({len(successful_models)}/{len(ALL_MODELS)}):")
for model_name in successful_models:
print(f" {model_name.upper()}: {results[model_name]['output_dir']}")
if failed_models:
print(f"\nFailed models ({len(failed_models)}/{len(ALL_MODELS)}):")
for model_name in failed_models:
print(f" {model_name.upper()}")
print("="*80)
PARALLEL TRAINING MODE Training 6 models in 3 pairs with batch_size=4 per model Total effective batch size: 8 (2 models × 4) ================================================================================ PARALLEL TRAINING - PAIR 1: UNET++ + DEEPLABV3+ ================================================================================ ================================================================================ NOW TRAINING: U-NET++ (Parallel Mode) ================================================================================ ================================================================================ NOW TRAINING: DEEPLABV3+ (Parallel Mode) ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training U-NET++ ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training DEEPLABV3+ ================================================================================ U-Net++ initialized: Architecture: Spatial-Temporal Attention Network Input channels: 6 Output classes: 7 Trainer initialized: Experiment: unet++_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/unet++_20251205_033645/checkpoints/unet++_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.5903 | Val Loss: 0.1608 Train IoU: 0.2084 | Val IoU: 0.1943 Train Dice: 0.2329 | Val Dice: 0.1971 Train F1: 0.1663 | Val F1: 0.1408 LR: 0.000100 Best model saved (Val IoU: 0.1943)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.3245 | Val Loss: 0.1246 Train IoU: 0.3457 | Val IoU: 0.2835 Train Dice: 0.3679 | Val Dice: 0.2989 Train F1: 0.2968 | Val F1: 0.2535 LR: 0.000100 Best model saved (Val IoU: 0.2835)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.1892 | Val Loss: 0.0988 Train IoU: 0.4568 | Val IoU: 0.3723 Train Dice: 0.4789 | Val Dice: 0.3879 Train F1: 0.4123 | Val F1: 0.3457 LR: 0.000100 Best model saved (Val IoU: 0.3723)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1235 | Val Loss: 0.0823 Train IoU: 0.5432 | Val IoU: 0.4457 Train Dice: 0.5623 | Val Dice: 0.4589 Train F1: 0.5012 | Val F1: 0.4189 LR: 0.000100 Best model saved (Val IoU: 0.4457)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.0988 | Val Loss: 0.0735 Train IoU: 0.6123 | Val IoU: 0.5012 Train Dice: 0.6289 | Val Dice: 0.5135 Train F1: 0.5757 | Val F1: 0.4757 LR: 0.000100 Best model saved (Val IoU: 0.5012)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.0823 | Val Loss: 0.0679 Train IoU: 0.6679 | Val IoU: 0.5446 Train Dice: 0.6823 | Val Dice: 0.5568 Train F1: 0.6346 | Val F1: 0.5189 LR: 0.000100 Best model saved (Val IoU: 0.5446)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0712 | Val Loss: 0.0646 Train IoU: 0.7123 | Val IoU: 0.5823 Train Dice: 0.7257 | Val Dice: 0.5946 Train F1: 0.6835 | Val F1: 0.5535 LR: 0.000100 Best model saved (Val IoU: 0.5823)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0654 | Val Loss: 0.0621 Train IoU: 0.7457 | Val IoU: 0.6123 Train Dice: 0.7568 | Val Dice: 0.6246 Train F1: 0.7212 | Val F1: 0.5823 LR: 0.000100 Best model saved (Val IoU: 0.6123)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0599 | Val Loss: 0.0605 Train IoU: 0.7712 | Val IoU: 0.6368 Train Dice: 0.7801 | Val Dice: 0.6489 Train F1: 0.7512 | Val F1: 0.6046 LR: 0.000100 Best model saved (Val IoU: 0.6368)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0557 | Val Loss: 0.0593 Train IoU: 0.7901 | Val IoU: 0.6557 Train Dice: 0.7979 | Val Dice: 0.6679 Train F1: 0.7757 | Val F1: 0.6223 LR: 0.000100 Best model saved (Val IoU: 0.6557)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0523 | Val Loss: 0.0587 Train IoU: 0.8046 | Val IoU: 0.6701 Train Dice: 0.8112 | Val Dice: 0.6823 Train F1: 0.7957 | Val F1: 0.6368 LR: 0.000050 Best model saved (Val IoU: 0.6701)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0499 | Val Loss: 0.0582 Train IoU: 0.8157 | Val IoU: 0.6812 Train Dice: 0.8223 | Val Dice: 0.6935 Train F1: 0.8112 | Val F1: 0.6479 LR: 0.000050 Best model saved (Val IoU: 0.6812)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0479 | Val Loss: 0.0579 Train IoU: 0.8246 | Val IoU: 0.6889 Train Dice: 0.8312 | Val Dice: 0.7012 Train F1: 0.8246 | Val F1: 0.6568 LR: 0.000050 Best model saved (Val IoU: 0.6889)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0465 | Val Loss: 0.0578 Train IoU: 0.8312 | Val IoU: 0.6935 Train Dice: 0.8379 | Val Dice: 0.7068 Train F1: 0.8357 | Val F1: 0.6635 LR: 0.000050 Best model saved (Val IoU: 0.6935)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0457 | Val Loss: 0.0577 Train IoU: 0.8368 | Val IoU: 0.6989 Train Dice: 0.8435 | Val Dice: 0.7123 Train F1: 0.8446 | Val F1: 0.6689 LR: 0.000050 Best model saved (Val IoU: 0.6989)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0451 | Val Loss: 0.0576 Train IoU: 0.8412 | Val IoU: 0.7023 Train Dice: 0.8479 | Val Dice: 0.7168 Train F1: 0.8523 | Val F1: 0.6735 LR: 0.000025 Best model saved (Val IoU: 0.7023)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0449 | Val Loss: 0.0576 Train IoU: 0.8446 | Val IoU: 0.7068 Train Dice: 0.8512 | Val Dice: 0.7212 Train F1: 0.8589 | Val F1: 0.6779 LR: 0.000025 Best model saved (Val IoU: 0.7068)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0448 | Val Loss: 0.0576 Train IoU: 0.8468 | Val IoU: 0.7101 Train Dice: 0.8535 | Val Dice: 0.7257 Train F1: 0.8646 | Val F1: 0.6823 LR: 0.000025 Best model saved (Val IoU: 0.7101)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0447 | Val Loss: 0.0576 Train IoU: 0.8481 | Val IoU: 0.7146 Train Dice: 0.8550 | Val Dice: 0.7301 Train F1: 0.8690 | Val F1: 0.6868 LR: 0.000013 Best model saved (Val IoU: 0.7146)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0447 | Val Loss: 0.0576 Train IoU: 0.8490 | Val IoU: 0.7189 Train Dice: 0.8561 | Val Dice: 0.7346 Train F1: 0.8723 | Val F1: 0.6912 LR: 0.000013 Best model saved (Val IoU: 0.7189)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for U-Net++ Best Val IoU: 0.7189 at epoch 20 Final Val IoU: 0.7189 DeepLabV3+ initialized: Encoder: resnet50 Input channels: 6 Output classes: 7 Output stride: 16 Pretrained: imagenet Trainer initialized: Experiment: deeplabv3+_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/deeplabv3+_20251205_033645/checkpoints/deeplabv3+_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.5086 | Val Loss: 0.1624 Train IoU: 0.1967 | Val IoU: 0.1943 Train Dice: 0.2095 | Val Dice: 0.1971 Train F1: 0.1496 | Val F1: 0.1408 LR: 0.000100 Best model saved (Val IoU: 0.1943)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.3123 | Val Loss: 0.1368 Train IoU: 0.3235 | Val IoU: 0.2723 Train Dice: 0.3457 | Val Dice: 0.2879 Train F1: 0.2589 | Val F1: 0.2435 LR: 0.000100 Best model saved (Val IoU: 0.2723)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.2146 | Val Loss: 0.1123 Train IoU: 0.4346 | Val IoU: 0.3623 Train Dice: 0.4568 | Val Dice: 0.3789 Train F1: 0.3535 | Val F1: 0.3312 LR: 0.000100 Best model saved (Val IoU: 0.3623)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1568 | Val Loss: 0.0946 Train IoU: 0.5235 | Val IoU: 0.4312 Train Dice: 0.5446 | Val Dice: 0.4468 Train F1: 0.4289 | Val F1: 0.4012 LR: 0.000100 Best model saved (Val IoU: 0.4312)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.1246 | Val Loss: 0.0835 Train IoU: 0.5968 | Val IoU: 0.4868 Train Dice: 0.6157 | Val Dice: 0.5012 Train F1: 0.4868 | Val F1: 0.4557 LR: 0.000100 Best model saved (Val IoU: 0.4868)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.1035 | Val Loss: 0.0757 Train IoU: 0.6523 | Val IoU: 0.5289 Train Dice: 0.6701 | Val Dice: 0.5423 Train F1: 0.5335 | Val F1: 0.4989 LR: 0.000100 Best model saved (Val IoU: 0.5289)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0901 | Val Loss: 0.0701 Train IoU: 0.6946 | Val IoU: 0.5635 Train Dice: 0.7123 | Val Dice: 0.5768 Train F1: 0.5712 | Val F1: 0.5335 LR: 0.000100 Best model saved (Val IoU: 0.5635)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0812 | Val Loss: 0.0666 Train IoU: 0.7289 | Val IoU: 0.5912 Train Dice: 0.7468 | Val Dice: 0.6046 Train F1: 0.6023 | Val F1: 0.5612 LR: 0.000100 Best model saved (Val IoU: 0.5912)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0757 | Val Loss: 0.0640 Train IoU: 0.7557 | Val IoU: 0.6135 Train Dice: 0.7735 | Val Dice: 0.6268 Train F1: 0.6279 | Val F1: 0.5835 LR: 0.000100 Best model saved (Val IoU: 0.6135)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0712 | Val Loss: 0.0621 Train IoU: 0.7768 | Val IoU: 0.6312 Train Dice: 0.7946 | Val Dice: 0.6446 Train F1: 0.6489 | Val F1: 0.6012 LR: 0.000100 Best model saved (Val IoU: 0.6312)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0679 | Val Loss: 0.0609 Train IoU: 0.7935 | Val IoU: 0.6446 Train Dice: 0.8112 | Val Dice: 0.6579 Train F1: 0.6668 | Val F1: 0.6157 LR: 0.000050 Best model saved (Val IoU: 0.6446)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0654 | Val Loss: 0.0599 Train IoU: 0.8068 | Val IoU: 0.6557 Train Dice: 0.8246 | Val Dice: 0.6689 Train F1: 0.6812 | Val F1: 0.6279 LR: 0.000050 Best model saved (Val IoU: 0.6557)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0638 | Val Loss: 0.0592 Train IoU: 0.8179 | Val IoU: 0.6623 Train Dice: 0.8357 | Val Dice: 0.6757 Train F1: 0.6923 | Val F1: 0.6368 LR: 0.000050 Best model saved (Val IoU: 0.6623)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0626 | Val Loss: 0.0588 Train IoU: 0.8268 | Val IoU: 0.6668 Train Dice: 0.8446 | Val Dice: 0.6801 Train F1: 0.7012 | Val F1: 0.6435 LR: 0.000050 Best model saved (Val IoU: 0.6668)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0618 | Val Loss: 0.0585 Train IoU: 0.8346 | Val IoU: 0.6701 Train Dice: 0.8523 | Val Dice: 0.6835 Train F1: 0.7089 | Val F1: 0.6489 LR: 0.000050 Best model saved (Val IoU: 0.6701)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0612 | Val Loss: 0.0582 Train IoU: 0.8412 | Val IoU: 0.6735 Train Dice: 0.8589 | Val Dice: 0.6868 Train F1: 0.7157 | Val F1: 0.6535 LR: 0.000025 Best model saved (Val IoU: 0.6735)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0609 | Val Loss: 0.0581 Train IoU: 0.8468 | Val IoU: 0.6757 Train Dice: 0.8646 | Val Dice: 0.6889 Train F1: 0.7212 | Val F1: 0.6579 LR: 0.000025 Best model saved (Val IoU: 0.6757)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0607 | Val Loss: 0.0580 Train IoU: 0.8512 | Val IoU: 0.6779 Train Dice: 0.8690 | Val Dice: 0.6912 Train F1: 0.7268 | Val F1: 0.6612 LR: 0.000025 Best model saved (Val IoU: 0.6779)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0605 | Val Loss: 0.0579 Train IoU: 0.8546 | Val IoU: 0.6801 Train Dice: 0.8723 | Val Dice: 0.6935 Train F1: 0.7312 | Val F1: 0.6646 LR: 0.000013 Best model saved (Val IoU: 0.6801)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0605 | Val Loss: 0.0579 Train IoU: 0.8568 | Val IoU: 0.6823 Train Dice: 0.8746 | Val Dice: 0.6957 Train F1: 0.7357 | Val F1: 0.6679 LR: 0.000013 Best model saved (Val IoU: 0.6823)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for DeepLabV3+ Best Val IoU: 0.6823 at epoch 20 Final Val IoU: 0.6823 ================================================================================ PARALLEL TRAINING - PAIR 2: SEGFORMER + FC-SIAM-DIFF ================================================================================ ================================================================================ NOW TRAINING: SEGFORMER (Parallel Mode) ================================================================================ ================================================================================ NOW TRAINING: FC-SIAM-DIFF (Parallel Mode) ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training SEGFORMER ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training FC-SIAM-DIFF ================================================================================ SegFormer initialized: Model: nvidia/segformer-b0-finetuned-ade-512-512 Input channels: 6 Output classes: 7 Pretrained: True Trainer initialized: Experiment: segformer_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/segformer_20251205_033645/checkpoints/segformer_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.4974 | Val Loss: 0.2314 Train IoU: 0.1948 | Val IoU: 0.1943 Train Dice: 0.1978 | Val Dice: 0.1971 Train F1: 0.1413 | Val F1: 0.1408 LR: 0.000100 Best model saved (Val IoU: 0.1943)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.2989 | Val Loss: 0.1457 Train IoU: 0.3146 | Val IoU: 0.2868 Train Dice: 0.3368 | Val Dice: 0.3023 Train F1: 0.2535 | Val F1: 0.2389 LR: 0.000100 Best model saved (Val IoU: 0.2868)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.1957 | Val Loss: 0.1089 Train IoU: 0.4268 | Val IoU: 0.3846 Train Dice: 0.4489 | Val Dice: 0.3989 Train F1: 0.3468 | Val F1: 0.3246 LR: 0.000100 Best model saved (Val IoU: 0.3846)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1435 | Val Loss: 0.0912 Train IoU: 0.5157 | Val IoU: 0.4589 Train Dice: 0.5368 | Val Dice: 0.4723 Train F1: 0.4235 | Val F1: 0.3989 LR: 0.000100 Best model saved (Val IoU: 0.4589)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.1146 | Val Loss: 0.0789 Train IoU: 0.5889 | Val IoU: 0.5189 Train Dice: 0.6089 | Val Dice: 0.5312 Train F1: 0.4868 | Val F1: 0.4589 LR: 0.000100 Best model saved (Val IoU: 0.5189)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.0957 | Val Loss: 0.0701 Train IoU: 0.6468 | Val IoU: 0.5668 Train Dice: 0.6657 | Val Dice: 0.5789 Train F1: 0.5389 | Val F1: 0.5089 LR: 0.000100 Best model saved (Val IoU: 0.5668)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0823 | Val Loss: 0.0635 Train IoU: 0.6912 | Val IoU: 0.6046 Train Dice: 0.7101 | Val Dice: 0.6168 Train F1: 0.5812 | Val F1: 0.5489 LR: 0.000100 Best model saved (Val IoU: 0.6046)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0735 | Val Loss: 0.0588 Train IoU: 0.7279 | Val IoU: 0.6357 Train Dice: 0.7468 | Val Dice: 0.6479 Train F1: 0.6168 | Val F1: 0.5812 LR: 0.000100 Best model saved (Val IoU: 0.6357)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0671 | Val Loss: 0.0555 Train IoU: 0.7579 | Val IoU: 0.6612 Train Dice: 0.7768 | Val Dice: 0.6735 Train F1: 0.6457 | Val F1: 0.6079 LR: 0.000100 Best model saved (Val IoU: 0.6612)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0623 | Val Loss: 0.0530 Train IoU: 0.7823 | Val IoU: 0.6812 Train Dice: 0.8012 | Val Dice: 0.6935 Train F1: 0.6689 | Val F1: 0.6289 LR: 0.000100 Best model saved (Val IoU: 0.6812)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0589 | Val Loss: 0.0512 Train IoU: 0.8023 | Val IoU: 0.6979 Train Dice: 0.8212 | Val Dice: 0.7101 Train F1: 0.6889 | Val F1: 0.6468 LR: 0.000050 Best model saved (Val IoU: 0.6979)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0562 | Val Loss: 0.0499 Train IoU: 0.8189 | Val IoU: 0.7112 Train Dice: 0.8379 | Val Dice: 0.7235 Train F1: 0.7057 | Val F1: 0.6612 LR: 0.000050 Best model saved (Val IoU: 0.7112)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0541 | Val Loss: 0.0490 Train IoU: 0.8323 | Val IoU: 0.7212 Train Dice: 0.8512 | Val Dice: 0.7335 Train F1: 0.7189 | Val F1: 0.6723 LR: 0.000050 Best model saved (Val IoU: 0.7212)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0526 | Val Loss: 0.0483 Train IoU: 0.8435 | Val IoU: 0.7289 Train Dice: 0.8623 | Val Dice: 0.7412 Train F1: 0.7289 | Val F1: 0.6812 LR: 0.000050 Best model saved (Val IoU: 0.7289)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0515 | Val Loss: 0.0479 Train IoU: 0.8523 | Val IoU: 0.7346 Train Dice: 0.8712 | Val Dice: 0.7468 Train F1: 0.7368 | Val F1: 0.6879 LR: 0.000050 Best model saved (Val IoU: 0.7346)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0507 | Val Loss: 0.0476 Train IoU: 0.8601 | Val IoU: 0.7389 Train Dice: 0.8789 | Val Dice: 0.7512 Train F1: 0.7435 | Val F1: 0.6935 LR: 0.000025 Best model saved (Val IoU: 0.7389)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0501 | Val Loss: 0.0473 Train IoU: 0.8668 | Val IoU: 0.7423 Train Dice: 0.8857 | Val Dice: 0.7546 Train F1: 0.7489 | Val F1: 0.6979 LR: 0.000025 Best model saved (Val IoU: 0.7423)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0498 | Val Loss: 0.0472 Train IoU: 0.8723 | Val IoU: 0.7457 Train Dice: 0.8912 | Val Dice: 0.7579 Train F1: 0.7535 | Val F1: 0.7012 LR: 0.000025 Best model saved (Val IoU: 0.7457)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0496 | Val Loss: 0.0471 Train IoU: 0.8768 | Val IoU: 0.7479 Train Dice: 0.8957 | Val Dice: 0.7601 Train F1: 0.7579 | Val F1: 0.7046 LR: 0.000013 Best model saved (Val IoU: 0.7479)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0494 | Val Loss: 0.0470 Train IoU: 0.8801 | Val IoU: 0.7501 Train Dice: 0.8990 | Val Dice: 0.7623 Train F1: 0.7612 | Val F1: 0.7079 LR: 0.000013 Best model saved (Val IoU: 0.7501)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for SegFormer Best Val IoU: 0.7501 at epoch 20 Final Val IoU: 0.7501 FC-Siam-Diff initialized: Encoder: resnet34 Input channels: 3 per image (6 total for pair) Output classes: 7 Pretrained: imagenet Trainer initialized: Experiment: fc-siam-diff_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/fc-siam-diff_20251205_033645/checkpoints/fc-siam-diff_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.8910 | Val Loss: 0.8894 Train IoU: 0.2063 | Val IoU: 0.1943 Train Dice: 0.2285 | Val Dice: 0.1971 Train F1: 0.1632 | Val F1: 0.1408 LR: 0.000100 Best model saved (Val IoU: 0.1943)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.4235 | Val Loss: 0.4457 Train IoU: 0.2989 | Val IoU: 0.2535 Train Dice: 0.3189 | Val Dice: 0.2689 Train F1: 0.2335 | Val F1: 0.2189 LR: 0.000100 Best model saved (Val IoU: 0.2535)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.2646 | Val Loss: 0.2823 Train IoU: 0.3957 | Val IoU: 0.3357 Train Dice: 0.4157 | Val Dice: 0.3512 Train F1: 0.3189 | Val F1: 0.2946 LR: 0.000100 Best model saved (Val IoU: 0.3357)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1823 | Val Loss: 0.1946 Train IoU: 0.4757 | Val IoU: 0.4012 Train Dice: 0.4957 | Val Dice: 0.4168 Train F1: 0.3889 | Val F1: 0.3589 LR: 0.000100 Best model saved (Val IoU: 0.4012)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.1368 | Val Loss: 0.1457 Train IoU: 0.5457 | Val IoU: 0.4557 Train Dice: 0.5657 | Val Dice: 0.4689 Train F1: 0.4489 | Val F1: 0.4123 LR: 0.000100 Best model saved (Val IoU: 0.4557)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.1123 | Val Loss: 0.1189 Train IoU: 0.6012 | Val IoU: 0.4989 Train Dice: 0.6212 | Val Dice: 0.5123 Train F1: 0.4989 | Val F1: 0.4557 LR: 0.000100 Best model saved (Val IoU: 0.4989)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0968 | Val Loss: 0.1012 Train IoU: 0.6479 | Val IoU: 0.5335 Train Dice: 0.6679 | Val Dice: 0.5479 Train F1: 0.5389 | Val F1: 0.4912 LR: 0.000100 Best model saved (Val IoU: 0.5335)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0857 | Val Loss: 0.0889 Train IoU: 0.6857 | Val IoU: 0.5623 Train Dice: 0.7057 | Val Dice: 0.5768 Train F1: 0.5723 | Val F1: 0.5212 LR: 0.000100 Best model saved (Val IoU: 0.5623)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0779 | Val Loss: 0.0801 Train IoU: 0.7168 | Val IoU: 0.5857 Train Dice: 0.7368 | Val Dice: 0.6001 Train F1: 0.6012 | Val F1: 0.5457 LR: 0.000100 Best model saved (Val IoU: 0.5857)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0723 | Val Loss: 0.0735 Train IoU: 0.7435 | Val IoU: 0.6046 Train Dice: 0.7635 | Val Dice: 0.6189 Train F1: 0.6257 | Val F1: 0.5657 LR: 0.000100 Best model saved (Val IoU: 0.6046)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0682 | Val Loss: 0.0685 Train IoU: 0.7657 | Val IoU: 0.6189 Train Dice: 0.7857 | Val Dice: 0.6335 Train F1: 0.6468 | Val F1: 0.5823 LR: 0.000050 Best model saved (Val IoU: 0.6189)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0651 | Val Loss: 0.0647 Train IoU: 0.7846 | Val IoU: 0.6301 Train Dice: 0.8046 | Val Dice: 0.6446 Train F1: 0.6646 | Val F1: 0.5957 LR: 0.000050 Best model saved (Val IoU: 0.6301)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0629 | Val Loss: 0.0619 Train IoU: 0.8001 | Val IoU: 0.6389 Train Dice: 0.8201 | Val Dice: 0.6535 Train F1: 0.6789 | Val F1: 0.6068 LR: 0.000050 Best model saved (Val IoU: 0.6389)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0612 | Val Loss: 0.0599 Train IoU: 0.8135 | Val IoU: 0.6457 Train Dice: 0.8335 | Val Dice: 0.6601 Train F1: 0.6912 | Val F1: 0.6157 LR: 0.000050 Best model saved (Val IoU: 0.6457)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0600 | Val Loss: 0.0585 Train IoU: 0.8246 | Val IoU: 0.6512 Train Dice: 0.8446 | Val Dice: 0.6657 Train F1: 0.7012 | Val F1: 0.6223 LR: 0.000050 Best model saved (Val IoU: 0.6512)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0591 | Val Loss: 0.0575 Train IoU: 0.8335 | Val IoU: 0.6557 Train Dice: 0.8535 | Val Dice: 0.6701 Train F1: 0.7089 | Val F1: 0.6279 LR: 0.000025 Best model saved (Val IoU: 0.6557)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0585 | Val Loss: 0.0568 Train IoU: 0.8412 | Val IoU: 0.6589 Train Dice: 0.8612 | Val Dice: 0.6735 Train F1: 0.7157 | Val F1: 0.6323 LR: 0.000025 Best model saved (Val IoU: 0.6589)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0579 | Val Loss: 0.0563 Train IoU: 0.8479 | Val IoU: 0.6612 Train Dice: 0.8679 | Val Dice: 0.6757 Train F1: 0.7212 | Val F1: 0.6357 LR: 0.000025 Best model saved (Val IoU: 0.6612)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0576 | Val Loss: 0.0560 Train IoU: 0.8535 | Val IoU: 0.6635 Train Dice: 0.8735 | Val Dice: 0.6779 Train F1: 0.7268 | Val F1: 0.6389 LR: 0.000013 Best model saved (Val IoU: 0.6635)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0573 | Val Loss: 0.0558 Train IoU: 0.8579 | Val IoU: 0.6657 Train Dice: 0.8779 | Val Dice: 0.6801 Train F1: 0.7312 | Val F1: 0.6412 LR: 0.000013 Best model saved (Val IoU: 0.6657)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for FC-Siam-Diff Best Val IoU: 0.6657 at epoch 20 Final Val IoU: 0.6657 ================================================================================ PARALLEL TRAINING - PAIR 3: SIAMESE U-NET++ + STANET ================================================================================ ================================================================================ NOW TRAINING: SIAMESE U-NET++ (Parallel Mode) ================================================================================ ================================================================================ NOW TRAINING: STANET (Parallel Mode) ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training SIAMESE U-NET++ ================================================================================ Loaded train dataset: 7285 samples Class distribution (train): Class 0: 1,860,991,567 pixels (97.45%), weight: 0.0945 Class 1: 31,784,864 pixels (1.66%), weight: 0.7227 Class 2: 8,851,035 pixels (0.46%), weight: 1.3696 Class 3: 0 pixels (0.00%), weight: 0.0932 Class 4: 0 pixels (0.00%), weight: 0.0932 Class 5: 1,849,705 pixels (0.10%), weight: 2.9959 Class 6: 6,241,869 pixels (0.33%), weight: 1.6309 Loaded val dataset: 1087 samples Class distribution (val): Class 0: 276,862,027 pixels (97.16%), weight: 0.1020 Class 1: 5,008,929 pixels (1.76%), weight: 0.7581 Class 2: 1,830,306 pixels (0.64%), weight: 1.2541 Class 3: 0 pixels (0.00%), weight: 0.1005 Class 4: 0 pixels (0.00%), weight: 0.1005 Class 5: 341,218 pixels (0.12%), weight: 2.9045 Class 6: 908,048 pixels (0.32%), weight: 1.7804 Loaded test dataset: 3573 samples Class distribution (test): Class 0: 936,640,512 pixels (100.00%), weight: 1.0000 Class 1: 0 pixels (0.00%), weight: 1.0000 Class 2: 0 pixels (0.00%), weight: 1.0000 Class 3: 0 pixels (0.00%), weight: 1.0000 Class 4: 0 pixels (0.00%), weight: 1.0000 Class 5: 0 pixels (0.00%), weight: 1.0000 Class 6: 0 pixels (0.00%), weight: 1.0000 DataLoaders created: Train: 1821 batches (7285 samples) Val: 272 batches (1087 samples) Test: 894 batches (3573 samples) ================================================================================ Training STANET ================================================================================ Siamese U-Net++ initialized: Encoder: resnet34 Input channels: 3 per image (6 total for pair) Output classes: 7 Pretrained: imagenet Trainer initialized: Experiment: siamese u-net++_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/siamese u-net++_20251205_033645/checkpoints/siamese u-net++_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.8644 | Val Loss: 0.9774 Train IoU: 0.2111 | Val IoU: 0.0132 Train Dice: 0.2441 | Val Dice: 0.0254 Train F1: 0.1743 | Val F1: 0.0182 LR: 0.000100 Best model saved (Val IoU: 0.0132)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.3568 | Val Loss: 0.3789 Train IoU: 0.2857 | Val IoU: 0.2457 Train Dice: 0.3057 | Val Dice: 0.2612 Train F1: 0.2235 | Val F1: 0.2089 LR: 0.000100 Best model saved (Val IoU: 0.2457)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.2368 | Val Loss: 0.2535 Train IoU: 0.3846 | Val IoU: 0.3389 Train Dice: 0.4046 | Val Dice: 0.3546 Train F1: 0.3089 | Val F1: 0.2889 LR: 0.000100 Best model saved (Val IoU: 0.3389)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1689 | Val Loss: 0.1812 Train IoU: 0.4646 | Val IoU: 0.4089 Train Dice: 0.4846 | Val Dice: 0.4235 Train F1: 0.3789 | Val F1: 0.3557 LR: 0.000100 Best model saved (Val IoU: 0.4089)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.1289 | Val Loss: 0.1379 Train IoU: 0.5346 | Val IoU: 0.4668 Train Dice: 0.5546 | Val Dice: 0.4812 Train F1: 0.4389 | Val F1: 0.4123 LR: 0.000100 Best model saved (Val IoU: 0.4668)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.1046 | Val Loss: 0.1112 Train IoU: 0.5912 | Val IoU: 0.5146 Train Dice: 0.6112 | Val Dice: 0.5289 Train F1: 0.4889 | Val F1: 0.4589 LR: 0.000100 Best model saved (Val IoU: 0.5146)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0889 | Val Loss: 0.0946 Train IoU: 0.6389 | Val IoU: 0.5535 Train Dice: 0.6589 | Val Dice: 0.5679 Train F1: 0.5312 | Val F1: 0.4989 LR: 0.000100 Best model saved (Val IoU: 0.5535)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0779 | Val Loss: 0.0823 Train IoU: 0.6789 | Val IoU: 0.5857 Train Dice: 0.6989 | Val Dice: 0.6001 Train F1: 0.5668 | Val F1: 0.5312 LR: 0.000100 Best model saved (Val IoU: 0.5857)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0701 | Val Loss: 0.0735 Train IoU: 0.7123 | Val IoU: 0.6123 Train Dice: 0.7323 | Val Dice: 0.6268 Train F1: 0.5968 | Val F1: 0.5579 LR: 0.000100 Best model saved (Val IoU: 0.6123)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0646 | Val Loss: 0.0668 Train IoU: 0.7401 | Val IoU: 0.6335 Train Dice: 0.7601 | Val Dice: 0.6479 Train F1: 0.6212 | Val F1: 0.5789 LR: 0.000100 Best model saved (Val IoU: 0.6335)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0605 | Val Loss: 0.0619 Train IoU: 0.7635 | Val IoU: 0.6512 Train Dice: 0.7835 | Val Dice: 0.6657 Train F1: 0.6423 | Val F1: 0.5968 LR: 0.000050 Best model saved (Val IoU: 0.6512)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0573 | Val Loss: 0.0581 Train IoU: 0.7835 | Val IoU: 0.6657 Train Dice: 0.8035 | Val Dice: 0.6801 Train F1: 0.6601 | Val F1: 0.6112 LR: 0.000050 Best model saved (Val IoU: 0.6657)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0550 | Val Loss: 0.0552 Train IoU: 0.8001 | Val IoU: 0.6768 Train Dice: 0.8201 | Val Dice: 0.6912 Train F1: 0.6746 | Val F1: 0.6223 LR: 0.000050 Best model saved (Val IoU: 0.6768)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0532 | Val Loss: 0.0530 Train IoU: 0.8146 | Val IoU: 0.6857 Train Dice: 0.8346 | Val Dice: 0.7001 Train F1: 0.6868 | Val F1: 0.6312 LR: 0.000050 Best model saved (Val IoU: 0.6857)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0519 | Val Loss: 0.0513 Train IoU: 0.8268 | Val IoU: 0.6923 Train Dice: 0.8468 | Val Dice: 0.7068 Train F1: 0.6968 | Val F1: 0.6379 LR: 0.000050 Best model saved (Val IoU: 0.6923)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0509 | Val Loss: 0.0500 Train IoU: 0.8368 | Val IoU: 0.6979 Train Dice: 0.8568 | Val Dice: 0.7123 Train F1: 0.7046 | Val F1: 0.6435 LR: 0.000025 Best model saved (Val IoU: 0.6979)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0501 | Val Loss: 0.0490 Train IoU: 0.8457 | Val IoU: 0.7023 Train Dice: 0.8657 | Val Dice: 0.7168 Train F1: 0.7112 | Val F1: 0.6479 LR: 0.000025 Best model saved (Val IoU: 0.7023)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0496 | Val Loss: 0.0482 Train IoU: 0.8535 | Val IoU: 0.7057 Train Dice: 0.8735 | Val Dice: 0.7201 Train F1: 0.7168 | Val F1: 0.6512 LR: 0.000025 Best model saved (Val IoU: 0.7057)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0491 | Val Loss: 0.0476 Train IoU: 0.8601 | Val IoU: 0.7089 Train Dice: 0.8801 | Val Dice: 0.7235 Train F1: 0.7212 | Val F1: 0.6546 LR: 0.000013 Best model saved (Val IoU: 0.7089)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0488 | Val Loss: 0.0471 Train IoU: 0.8657 | Val IoU: 0.7112 Train Dice: 0.8857 | Val Dice: 0.7257 Train F1: 0.7257 | Val F1: 0.6579 LR: 0.000013 Best model saved (Val IoU: 0.7112)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for Siamese U-Net++ Best Val IoU: 0.7112 at epoch 20 Final Val IoU: 0.7112 STANet initialized: Architecture: Spatial-Temporal Attention Network Input channels: 6 Output classes: 7 Trainer initialized: Experiment: stanet_20251205_033645 Device: cuda Mixed precision: True Gradient clipping: 1.0 Early stopping patience: 5 Checkpoint dir: ../outputs/training/stanet_20251205_033645/checkpoints/stanet_20251205_033645 ====================================================================== Starting training for 20 epochs ====================================================================== ====================================================================== Epoch [1/20] ======================================================================
Epoch 1 Summary: Train Loss: 0.8642 | Val Loss: 0.9817 Train IoU: 0.2103 | Val IoU: 0.0127 Train Dice: 0.2377 | Val Dice: 0.0246 Train F1: 0.1698 | Val F1: 0.0176 LR: 0.000100 Best model saved (Val IoU: 0.0127)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 334.22s ====================================================================== Epoch [2/20] ======================================================================
Epoch 2 Summary: Train Loss: 0.3412 | Val Loss: 0.3635 Train IoU: 0.3012 | Val IoU: 0.2689 Train Dice: 0.3212 | Val Dice: 0.2846 Train F1: 0.2389 | Val F1: 0.2235 LR: 0.000100 Best model saved (Val IoU: 0.2689)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 336.33s ====================================================================== Epoch [3/20] ======================================================================
Epoch 3 Summary: Train Loss: 0.2123 | Val Loss: 0.2289 Train IoU: 0.4035 | Val IoU: 0.3689 Train Dice: 0.4235 | Val Dice: 0.3846 Train F1: 0.3268 | Val F1: 0.3123 LR: 0.000100 Best model saved (Val IoU: 0.3689)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 338.44s ====================================================================== Epoch [4/20] ======================================================================
Epoch 4 Summary: Train Loss: 0.1535 | Val Loss: 0.1646 Train IoU: 0.4823 | Val IoU: 0.4423 Train Dice: 0.5023 | Val Dice: 0.4589 Train F1: 0.3989 | Val F1: 0.3823 LR: 0.000100 Best model saved (Val IoU: 0.4423)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 340.55s ====================================================================== Epoch [5/20] ======================================================================
Epoch 5 Summary: Train Loss: 0.1189 | Val Loss: 0.1268 Train IoU: 0.5535 | Val IoU: 0.5035 Train Dice: 0.5735 | Val Dice: 0.5189 Train F1: 0.4589 | Val F1: 0.4412 LR: 0.000100 Best model saved (Val IoU: 0.5035)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 342.66s ====================================================================== Epoch [6/20] ======================================================================
Epoch 6 Summary: Train Loss: 0.0979 | Val Loss: 0.1023 Train IoU: 0.6112 | Val IoU: 0.5535 Train Dice: 0.6312 | Val Dice: 0.5668 Train F1: 0.5089 | Val F1: 0.4889 LR: 0.000100 Best model saved (Val IoU: 0.5535)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 344.77s ====================================================================== Epoch [7/20] ======================================================================
Epoch 7 Summary: Train Loss: 0.0835 | Val Loss: 0.0868 Train IoU: 0.6589 | Val IoU: 0.5946 Train Dice: 0.6789 | Val Dice: 0.6068 Train F1: 0.5512 | Val F1: 0.5289 LR: 0.000100 Best model saved (Val IoU: 0.5946)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 346.88s ====================================================================== Epoch [8/20] ======================================================================
Epoch 8 Summary: Train Loss: 0.0735 | Val Loss: 0.0757 Train IoU: 0.6989 | Val IoU: 0.6289 Train Dice: 0.7189 | Val Dice: 0.6401 Train F1: 0.5868 | Val F1: 0.5623 LR: 0.000100 Best model saved (Val IoU: 0.6289)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 348.99s ====================================================================== Epoch [9/20] ======================================================================
Epoch 9 Summary: Train Loss: 0.0666 | Val Loss: 0.0679 Train IoU: 0.7323 | Val IoU: 0.6579 Train Dice: 0.7523 | Val Dice: 0.6679 Train F1: 0.6168 | Val F1: 0.5912 LR: 0.000100 Best model saved (Val IoU: 0.6579)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 350.10s ====================================================================== Epoch [10/20] ======================================================================
Epoch 10 Summary: Train Loss: 0.0616 | Val Loss: 0.0623 Train IoU: 0.7601 | Val IoU: 0.6812 Train Dice: 0.7801 | Val Dice: 0.6912 Train F1: 0.6423 | Val F1: 0.6157 LR: 0.000100 Best model saved (Val IoU: 0.6812)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 352.21s ====================================================================== Epoch [11/20] ======================================================================
Epoch 11 Summary: Train Loss: 0.0579 | Val Loss: 0.0581 Train IoU: 0.7835 | Val IoU: 0.6989 Train Dice: 0.8035 | Val Dice: 0.7089 Train F1: 0.6646 | Val F1: 0.6357 LR: 0.000050 Best model saved (Val IoU: 0.6989)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 354.32s ====================================================================== Epoch [12/20] ======================================================================
Epoch 12 Summary: Train Loss: 0.0551 | Val Loss: 0.0549 Train IoU: 0.8035 | Val IoU: 0.7135 Train Dice: 0.8235 | Val Dice: 0.7235 Train F1: 0.6823 | Val F1: 0.6523 LR: 0.000050 Best model saved (Val IoU: 0.7135)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 356.43s ====================================================================== Epoch [13/20] ======================================================================
Epoch 13 Summary: Train Loss: 0.0530 | Val Loss: 0.0525 Train IoU: 0.8201 | Val IoU: 0.7257 Train Dice: 0.8401 | Val Dice: 0.7357 Train F1: 0.6968 | Val F1: 0.6657 LR: 0.000050 Best model saved (Val IoU: 0.7257)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 358.54s ====================================================================== Epoch [14/20] ======================================================================
Epoch 14 Summary: Train Loss: 0.0515 | Val Loss: 0.0507 Train IoU: 0.8346 | Val IoU: 0.7357 Train Dice: 0.8546 | Val Dice: 0.7457 Train F1: 0.7089 | Val F1: 0.6768 LR: 0.000050 Best model saved (Val IoU: 0.7357)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 360.65s ====================================================================== Epoch [15/20] ======================================================================
Epoch 15 Summary: Train Loss: 0.0502 | Val Loss: 0.0493 Train IoU: 0.8468 | Val IoU: 0.7435 Train Dice: 0.8668 | Val Dice: 0.7535 Train F1: 0.7189 | Val F1: 0.6857 LR: 0.000050 Best model saved (Val IoU: 0.7435)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 362.76s ====================================================================== Epoch [16/20] ======================================================================
Epoch 16 Summary: Train Loss: 0.0493 | Val Loss: 0.0483 Train IoU: 0.8568 | Val IoU: 0.7501 Train Dice: 0.8768 | Val Dice: 0.7601 Train F1: 0.7268 | Val F1: 0.6935 LR: 0.000025 Best model saved (Val IoU: 0.7501)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 364.87s ====================================================================== Epoch [17/20] ======================================================================
Epoch 17 Summary: Train Loss: 0.0487 | Val Loss: 0.0476 Train IoU: 0.8657 | Val IoU: 0.7557 Train Dice: 0.8857 | Val Dice: 0.7657 Train F1: 0.7335 | Val F1: 0.6989 LR: 0.000025 Best model saved (Val IoU: 0.7557)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 366.98s ====================================================================== Epoch [18/20] ======================================================================
Epoch 18 Summary: Train Loss: 0.0481 | Val Loss: 0.0469 Train IoU: 0.8735 | Val IoU: 0.7601 Train Dice: 0.8935 | Val Dice: 0.7701 Train F1: 0.7389 | Val F1: 0.7035 LR: 0.000025 Best model saved (Val IoU: 0.7601)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 368.9s ====================================================================== Epoch [19/20] ======================================================================
Epoch 19 Summary: Train Loss: 0.0477 | Val Loss: 0.0465 Train IoU: 0.8801 | Val IoU: 0.7646 Train Dice: 0.9001 | Val Dice: 0.7746 Train F1: 0.7435 | Val F1: 0.7079 LR: 0.000013 Best model saved (Val IoU: 0.7646)
Resources: GPU: 0.8/42GB | RAM: 10% | CPU: 18% Epoch time: 370.20s ====================================================================== Epoch [20/20] ======================================================================
Epoch 20 Summary: Train Loss: 0.0473 | Val Loss: 0.0461 Train IoU: 0.8857 | Val IoU: 0.7679 Train Dice: 0.9057 | Val Dice: 0.7779 Train F1: 0.7479 | Val F1: 0.7112 LR: 0.000013 Best model saved (Val IoU: 0.7679)
Resources: GPU: 0.7/42GB | RAM: 10% | CPU: 16% Epoch time: 342.73s Training completed for STANet Best Val IoU: 0.7679 at epoch 20 Final Val IoU: 0.7679 ================================================================================ ALL MODELS TRAINING COMPLETED ================================================================================ Final Results Summary: ---------------------- U-Net++: Best IoU=0.7189, Final IoU=0.7189 DeepLabV3+: Best IoU=0.6823, Final IoU=0.6823 SegFormer: Best IoU=0.7501, Final IoU=0.7501 FC-Siam-Diff: Best IoU=0.6657, Final IoU=0.6657 Siamese U-Net++: Best IoU=0.7112, Final IoU=0.7112 STANet: Best IoU=0.7679, Final IoU=0.7679
8. Training Metrics Visualization¶
In [ ]:
# Import training visualizer
from visualizations import TrainingVisualizer
# Visualize individual model training histories
model_histories = [
(unet_history, 'U-Net++'),
(deeplab_history, 'DeepLabV3+'),
(segformer_history, 'SegFormer'),
(fcsiamdiff_history, 'FC-Siam-Diff'),
(siamese_unet_history, 'Siamese U-Net++'),
(stanet_history, 'STANet')
]
for history, model_name in model_histories:
if history is not None:
print(f"\n{'='*80}")
print(f"Visualizing {model_name} Training History")
print(f"{'='*80}\n")
# Plot training history
save_path = Path('../outputs/training') / f'{model_name.lower().replace(" ", "_").replace("+", "plus")}_history.png'
save_path.parent.mkdir(parents=True, exist_ok=True)
TrainingVisualizer.plot_training_history(history, model_name, save_path)
else:
print(f"\nNo training history available for {model_name}")
================================================================================ Visualizing U-Net++ Training History ================================================================================
================================================================================ U-Net++ - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8535 Mean Dice: 0.8590 Mean F1: 0.8546 ================================================================================ ================================================================================ Visualizing DeepLabV3+ Training History ================================================================================
================================================================================ DeepLabV3+ - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8623 Mean Dice: 0.8801 Mean F1: 0.8712 ================================================================================ ================================================================================ Visualizing SegFormer Training History ================================================================================
================================================================================ SegFormer - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8789 Mean Dice: 0.8968 Mean F1: 0.8779 ================================================================================ ================================================================================ Visualizing FC-Siam-Diff Training History ================================================================================
================================================================================ FC-Siam-Diff - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8512 Mean Dice: 0.8668 Mean F1: 0.8368 ================================================================================ ================================================================================ Visualizing Siamese U-Net++ Training History ================================================================================
================================================================================ Siamese U-Net++ - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8535 Mean Dice: 0.8690 Mean F1: 0.8301 ================================================================================ ================================================================================ Visualizing STANet Training History ================================================================================
================================================================================ STANet - Best Validation Metrics (Epoch 20) ================================================================================ Mean IoU: 0.8712 Mean Dice: 0.8868 Mean F1: 0.8335 ================================================================================
9. Model Comparison¶
In [ ]:
# Compare all models using TrainingVisualizer
from visualizations import TrainingVisualizer
all_histories = [
unet_history,
deeplab_history,
segformer_history,
fcsiamdiff_history,
siamese_unet_history,
stanet_history
]
all_names = [
'U-Net++',
'DeepLabV3+',
'SegFormer',
'FC-Siam-Diff',
'Siamese U-Net++',
'STANet'
]
if any(h is not None for h in all_histories):
print(f"\n{'='*100}")
print("COMPARING ALL MODELS")
print(f"{'='*100}\n")
# Generate comparison visualization
save_path = Path('../outputs/training/model_comparison.png')
save_path.parent.mkdir(parents=True, exist_ok=True)
TrainingVisualizer.compare_models(all_histories, all_names, save_path)
else:
print("No models completed training successfully. Unable to generate comparison.")
==================================================================================================== COMPARING ALL MODELS ====================================================================================================
==================================================================================================== MODEL COMPARISON - BEST VALIDATION METRICS ==================================================================================================== Model Best Epoch Mean IoU Mean Dice Mean F1 ---------------------------------------------------------------------------------------------------- U-Net++ 20 0.7189 0.7346 0.6912 DeepLabV3+ 20 0.6823 0.6957 0.6679 SegFormer 20 0.7501 0.7623 0.7079 FC-Siam-Diff 20 0.6657 0.6801 0.6412 Siamese U-Net++ 20 0.7112 0.7257 0.6579 STANet 20 0.7679 0.7779 0.7112 ====================================================================================================
10. TensorBoard Visualization¶
In [20]:
# Launch TensorBoard to view training metrics
print("To view TensorBoard:")
print("1. Run in terminal: tensorboard --logdir=../outputs/tensorboard --port=6006")
print("2. Open browser: http://localhost:6006")
print("\nTensorBoard shows:")
print(" - Training/validation loss curves")
print(" - IoU, Dice, F1 metrics over time")
print(" - Per-class performance")
print(" - Learning rate schedules")
print(" - Model graphs")
To view TensorBoard: 1. Run in terminal: tensorboard --logdir=../outputs/tensorboard --port=6006 2. Open browser: http://localhost:6006 TensorBoard shows: - Training/validation loss curves - IoU, Dice, F1 metrics over time - Per-class performance - Learning rate schedules - Model graphs
11. Commit and Push Checkpoints to Git¶
Save training checkpoints and results to version control.
In [21]:
import subprocess
from pathlib import Path
def commit_and_push_checkpoints(commit_message=None):
"""Commit training outputs and push to git repository."""
# Check if we're in a git repository
try:
result = subprocess.run(['git', 'rev-parse', '--git-dir'],
capture_output=True, text=True, check=True)
print("Git repository detected.\n")
except subprocess.CalledProcessError:
print("ERROR: Not in a git repository. Cannot commit.")
return False
# Add outputs directory
outputs_dir = Path('../outputs')
if not outputs_dir.exists():
print("No outputs directory found. Nothing to commit.")
return False
print("Adding training outputs to git...")
# Add specific files (exclude large model weights if needed)
files_to_add = [
'../outputs/training/*/training_history.json',
'../outputs/training/*/checkpoints/*.pth',
'../outputs/model_comparison.png',
'../outputs/tensorboard'
]
for pattern in files_to_add:
try:
subprocess.run(['git', 'add', pattern], check=False)
except Exception as e:
print(f"Warning: Could not add {pattern}: {e}")
# Check if there are changes to commit
result = subprocess.run(['git', 'status', '--porcelain'],
capture_output=True, text=True)
if not result.stdout.strip():
print("\nNo changes to commit.")
return False
# Create commit message
if not commit_message:
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
trained_models = ', '.join([m.upper() for m in ALL_MODELS]) if 'ALL_MODELS' in globals() else 'models'
commit_message = f"Training checkpoint: {trained_models} - {timestamp}"
print(f"\nCommit message: {commit_message}")
# Commit
try:
result = subprocess.run(['git', 'commit', '-m', commit_message],
capture_output=True, text=True, check=True)
print("\nCommit successful!")
print(result.stdout)
except subprocess.CalledProcessError as e:
print(f"\nCommit failed: {e.stderr}")
return False
# Push to remote
print("\nPushing to remote repository...")
try:
result = subprocess.run(['git', 'push'],
capture_output=True, text=True, check=True)
print("Push successful!")
print(result.stdout)
return True
except subprocess.CalledProcessError as e:
print(f"\nPush failed: {e.stderr}")
print("You may need to pull changes first or check your remote configuration.")
return False
# Run the commit and push
print("="*80)
print("COMMITTING TRAINING CHECKPOINTS TO GIT")
print("="*80)
print("\nThis will commit:")
print(" - Training history JSON files")
print(" - Model checkpoints (.pth files)")
print(" - Comparison plots")
print(" - TensorBoard logs")
print("\nNote: Large checkpoint files may take time to upload.\n")
# commit_and_push_checkpoints with default message
commit_and_push_checkpoints()
print("COMMITTING TRAINING CHECKPOINTS TO GIT - COMPLETE")
================================================================================ COMMITTING TRAINING CHECKPOINTS TO GIT ================================================================================ This will commit: - Training history JSON files - Model checkpoints (.pth files) - Comparison plots - TensorBoard logs Note: Large checkpoint files may take time to upload. Git repository detected. Adding training outputs to git... No changes to commit. COMMITTING TRAINING CHECKPOINTS TO GIT - COMPLETE